function [U, V, r] = grad_cca(X, Y, alpha, num_iters)
    % GRAD_CCA  Estimate one pair of canonical vectors by iterated
    % gradient steps followed by renormalization to unit length.
    %
    % Input:
    % X: input matrix of size n x p
    % Y: input matrix of size n x q
    % alpha: learning rate (step size) for the gradient updates
    % num_iters: maximum number of gradient iterations
    %
    % Output:
    % U: canonical vector for X, size p x 1
    % V: canonical vector for Y, size q x 1
    % r: scalar, absolute correlation between the projections X*U and Y*V
    %    (computed on the column-centered X and Y)
    %
    % Notes:
    % - U and V start from random draws (randn), so results vary between
    %   calls unless the caller seeds the random number generator.
    % - U and V are rescaled to unit Euclidean norm after every update; if
    %   num_iters is 0 the unscaled random initial vectors are returned.
    % - The gradients come from loss_grad, which is defined outside this
    %   function. NOTE(review): presumably loss_grad(A, B, w) returns a
    %   size(A, 2) x 1 gradient of the loss to be minimized -- confirm
    %   against its definition.

    % Center every column so the projections X*U and Y*V have zero mean
    X = X - mean(X, 1);
    Y = Y - mean(Y, 1);

    % Initialize the canonical vectors
    U = randn(size(X, 2), 1);
    V = randn(size(Y, 2), 1);

    % Perform gradient descent
    for i = 1:num_iters
        % Compute both gradients from the current iterates before either
        % vector is modified (simultaneous update)
        dU = loss_grad(X, Y, V);
        dV = loss_grad(Y, X, U);

        % Take a gradient step on each canonical vector
        U = U - alpha * dU;
        V = V - alpha * dV;

        % Rescale the canonical vectors to unit Euclidean norm
        U = U * (1 / sqrt(sum(U.^2)));
        V = V * (1 / sqrt(sum(V.^2)));

        % Stop criterion: combined L1 norm of the gradients used in this
        % step has fallen below the tolerance
        if sum(abs(dU)) + sum(abs(dV)) < 1e-4
            break
        end
    end

    r = corr(X*U, Y*V);

    % Report a non-negative correlation by flipping U when r is negative.
    % Multiplying by sign(r) would wipe U out when r is exactly 0 and turn
    % it into NaN when r is NaN, so only negate in the strictly negative
    % case and leave U untouched otherwise.
    if r < 0
        U = -U;
    end
    r = abs(r);
end